This notebook is based on the original fragment detection notebook, but is specific to the detection of subordinate clause fragments. As our training data we will use a data file of 50,000+ sentences, each containing a subordinate clause at the beginning, middle, or end of the sentence, and 50,000+ subordinate clauses extracted from those sentences -- these raw subordinate clauses will always be fragments. The labels will be either 1 or 0, where 1 indicates a subordinate clause fragment and 0 indicates that it is NOT a subordinate clause fragment (it does not mean that it is a sentence).
In [ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
nlp = spacy.load('en')
import re
from nltk.util import ngrams, trigrams
import csv
In [ ]:
import subprocess

# Run combine.py inside the subordinate-clauses data directory; it presumably
# produces the childrens_fragments.combined.txt file that is read further down
# (TODO confirm against combine.py).
# check_call blocks until the script exits and raises CalledProcessError on a
# non-zero exit status. A bare Popen would return immediately, letting later
# cells read a missing or half-written file and hiding script failures.
subprocess.check_call(
    ["python", "combine.py", "childrens_fragments"],
    cwd='../data/fragments/subordinate-clauses',
)
In [ ]:
# Read the combined corpus, one example per line with surrounding whitespace
# stripped. Labels simply alternate with the 0-based line number: even lines
# get 0 and odd lines get 1.
with open("../data/fragments/subordinate-clauses/childrens_fragments.combined.txt","r") as f:
    texts = [raw_line.strip() for raw_line in f]
labels = [position % 2 for position in range(len(texts))]
print(texts[-10:])
In [ ]:
import random

# Shuffle texts and labels in unison: pair them up, shuffle the pairs, then
# unzip and write both columns back into the existing list objects.
combined = list(zip(texts, labels))
random.shuffle(combined)
shuffled_texts, shuffled_labels = zip(*combined)
texts[:] = shuffled_texts
labels[:] = shuffled_labels
print(texts[-10:])
print(labels[-10:])
In [ ]:
def textStringToPOSArray(text):
    """Return the spaCy `tag_` value of every token in `text`, in order.

    Args:
        text: Raw text to run through the module-level `nlp` pipeline.

    Returns:
        A list with one tag string per token.
    """
    return [token.tag_ for token in nlp(text)]


textStringToPOSArray(texts[3])
In [ ]:
def find_ngrams(input_list, n):
    """Zip `input_list` against itself at offsets 0..n-1 to form n-grams.

    Args:
        input_list: A sliceable sequence of items.
        n: Number of consecutive items per n-gram.

    Returns:
        The result of `zip` over the n shifted slices; each element is a
        tuple of n consecutive items. Yields nothing when the sequence is
        shorter than `n` or when `n` is 0.
    """
    shifted_views = (input_list[start:] for start in range(n))
    return zip(*shifted_views)
def getPOSTrigramsForTextString(text):
    """Return the list of POS-tag trigrams for `text`.

    The text is tagged with `textStringToPOSArray` and the resulting tag
    sequence is passed to nltk's `trigrams`.

    Args:
        text: Raw text to tag.

    Returns:
        A list of 3-tuples of tag strings.
    """
    return list(trigrams(textStringToPOSArray(text)))


print("Text: ", texts[3], labels[3])
getPOSTrigramsForTextString(texts[3])
In [ ]:
def trigramsToDictKeys(trigrams):
    """Turn each trigram into a single string key by joining its items with '>'.

    Args:
        trigrams: Iterable of string tuples. NOTE: inside this function the
            parameter shadows the `trigrams` helper imported from nltk.

    Returns:
        A list of joined strings, one per input trigram, in input order.
    """
    return ['>'.join(parts) for parts in trigrams]
print(texts[2])
print(trigramsToDictKeys(getPOSTrigramsForTextString(texts[2])))
In [ ]:
from collections import Counter

# Tally how often each POS-trigram key occurs across every text in the corpus.
total_counts = Counter()
for textString in texts:
    trigram_keys = trigramsToDictKeys(getPOSTrigramsForTextString(textString))
    total_counts.update(trigram_keys)
c = total_counts  # same Counter object, kept under its short name as well
print("Total words in data set: ", len(total_counts))
In [ ]:
# Distinct trigram keys ordered from most to least frequent; keys with equal
# counts keep the order in which they were first counted.
vocab = [trigram_key for trigram_key, _ in total_counts.most_common()]
print(vocab[:60])
In [ ]:
print(vocab[-1], ': ', total_counts[vocab[-1]])
Take the trigrams and index them
In [ ]:
# Map every trigram key to its position in `vocab` (its column in the feature vectors).
word2idx = dict(zip(vocab, range(len(vocab))))
print(word2idx)
In [ ]:
def textToTrigrams(text):
    """Return the '>'-joined POS-trigram keys for `text`.

    Args:
        text: Raw text to tag and split into trigrams.

    Returns:
        A list of trigram key strings as produced by `trigramsToDictKeys`.
    """
    pos_trigrams = getPOSTrigramsForTextString(text)
    return trigramsToDictKeys(pos_trigrams)
def text_to_vector(text):
    """Count the known POS trigrams of `text` into a vocabulary-sized vector.

    Args:
        text: Raw text to vectorize.

    Returns:
        A NumPy array of length `len(vocab)` where position `word2idx[key]`
        holds the number of times trigram `key` occurs in `text`. Trigrams
        that are not in `word2idx` are ignored.
    """
    wordVector = np.zeros(len(vocab))
    for trigram_key in textToTrigrams(text):
        if trigram_key in word2idx:
            wordVector[word2idx[trigram_key]] += 1
    return wordVector
In [ ]:
text_to_vector('Until I died, I laughed')[:65]
In [ ]:
# Feature matrix: one row per text, one column per vocabulary trigram.
# The float counts returned by text_to_vector are cast to integers on assignment.
word_vectors = np.zeros((len(texts), len(vocab)), dtype=np.int_)
for row, sample in enumerate(texts):
    word_vectors[row, :] = text_to_vector(sample)
In [ ]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]
In [ ]:
records = len(labels)
# NOTE: despite its name, `test_fraction` is the share of records used for
# TRAINING (0.9); the remaining records form the held-out test set.
test_fraction = 0.9
train_split = int(records * test_fraction)  # index where the test set begins
test_split = records - train_split          # number of held-out test records
print(train_split, test_split)

# Train on the first `train_split` rows and test on everything after them.
# Both slices are cut at the same index so that no record lands in both sets;
# slicing the test set from a different index would leak training rows into
# it and inflate the reported test accuracy.
trainX, trainY = word_vectors[:train_split], to_categorical(labels[:train_split], 2)
testX, testY = word_vectors[train_split:], to_categorical(labels[train_split:], 2)
In [ ]:
trainX[-1], trainY[-1]
In [ ]:
len(trainY), len(testY), len(trainY) + len(testY)
In [ ]:
# Network building
def build_model():
    """Build a fresh tflearn DNN classifier over the trigram count vectors.

    Architecture: an input of width `len(vocab)`, two fully connected hidden
    layers (200 and 25 units, 'ReLU'), and a 2-unit softmax output, trained
    with SGD (learning rate 0.1) on categorical cross-entropy.

    Returns:
        A `tflearn.DNN` wrapping the network.
    """
    # Clear the default TensorFlow graph first so that rebuilding the model
    # starts from a clean slate.
    tf.reset_default_graph()

    net = tflearn.input_data([None, len(vocab)])
    for hidden_units in (200, 25):
        net = tflearn.fully_connected(net, hidden_units, activation='ReLU')
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    return tflearn.DNN(net)
In [ ]:
len(vocab)
In [ ]:
model = build_model()
In [ ]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=50)
In [ ]:
# Evaluate on the test split. Column 0 of both the model output and the
# one-hot targets is compared: the prediction is thresholded at 0.5 and the
# accuracy is the fraction of rows where it matches the target.
first_column_probs = np.array(model.predict(testX))[:, 0]
predictions = (first_column_probs >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:, 0], axis=0)
print("Test accuracy: ", test_accuracy)
In [ ]:
# Persist the trigram -> index mapping as "key,index" rows so the same feature
# layout can be rebuilt outside this notebook.
# The `with` block guarantees the file is flushed and closed once every row
# has been written; newline="" is what the csv module expects for files it
# writes, so no extra blank lines appear on platforms that translate newlines.
with open("../models/subordinatevocabindex.csv", "w", newline="") as vocab_file:
    w = csv.writer(vocab_file)
    w.writerows(word2idx.items())
In [ ]:
model.save("../models/subordinate_model.tfl")
In [ ]:
def test_sentence(sentence):
    """Print the model's verdict on whether `sentence` is a subordinate clause fragment.

    The sentence is vectorized with `text_to_vector`, run through the
    module-level `model`, and the second output column is reported as the
    positive probability; values above 0.5 print 'Yes', otherwise 'No'.

    Args:
        sentence: Text to classify.
    """
    vector = text_to_vector(sentence)
    positive_prob = model.predict([vector])[0][1]
    if positive_prob > 0.5:
        verdict = 'Yes'
    else:
        verdict = 'No'
    print('Is this a subordinate clause fragment?\n {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), verdict)
In [ ]:
# Manual spot checks: each call prints the model's probability that the given
# text is a subordinate clause fragment. The inputs mix short standalone
# phrases/clauses with longer sentences that contain the same wording.
test_sentence("Until the end of time.")
In [ ]:
test_sentence("She would love him until the end of time.")
In [ ]:
test_sentence("Until the end of time, she would love him.")
In [ ]:
test_sentence("Ryan, in the dead of night, arrived on the banks of the Delaware.")
In [ ]:
test_sentence("In the dead of night.")
In [ ]:
test_sentence("In the dead of night, Ryan arrived on the banks of the Delaware.")
In [ ]:
test_sentence("Ryan arrived on the banks of the Delaware in the dead of night.")
In [ ]:
test_sentence("At the end of her rope.")
In [ ]:
test_sentence("Cindy was at the end of her rope.")
In [ ]:
test_sentence("Cindy was done, at the end of her rope.")
In [ ]:
test_sentence("On the iron throne, Joffry looked rather fat.")
In [ ]:
test_sentence("On the iron throne.")
In [ ]:
test_sentence("Unless Christine finishes her calculus homework.")
In [ ]:
test_sentence("Unless Christine finishes her calculus homework, she will have to suffer Mr. Nguyen's wrath in class tomorrow.")
In [ ]:
test_sentence("Because her best friend Giselle insisted on gossiping during their study session the night before.")
In [ ]:
test_sentence("While Bailey slept on the sofa in front of the television.")
In [ ]:
test_sentence("While Bailey slept on the sofa in front of the television, Samson, the family dog, gnawed on the leg of the coffee table.")
In [ ]:
test_sentence("Tanya did poorly on her history exam because her best friend Giselle insisted on gossiping during their study session the night before.")
Save the vocab
In [ ]:
vocab
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: